Planning considerations:

- Parking accommodations
- Hotel reservations and pricing packages
- Race-day support: police, volunteers, and space to physically queue up the racers at the start
- Swag and timing-chip orders
- Swag and bib/chip distribution
- Logistics of transitioning to a virtual race, including shipping everything (do we want to consider this piece?)

#Libraries
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(RColorBrewer)
library(chron)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:chron':
## 
##     days, hours, minutes, seconds, years
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(viridis)
## Loading required package: viridisLite
# Load the raw results table from an .rds file.
# The location can be overridden with the CBW_TABLE_RAW_PATH environment
# variable so the report also runs on other machines; when the variable is
# not set, the original local path is used.
cbw_table_path <- Sys.getenv(
  "CBW_TABLE_RAW_PATH",
  unset = "C:/Users/sabri/Downloads/CBWtableRaw.rds"
)
CBWtableRaw <- readRDS(cbw_table_path)
# Preview the first rows.
head(CBWtableRaw)
##       Race                  Name Age    Time Pace PiS/TiS Division PiD/TiD
## 1 1999 10M        Jane Omoro (W)  26 0:53:37 5:22  1/2358    W2529   1/559
## 2 1999 10M       Jane Ngotho (W)  29 0:53:38 5:22  2/2358    W2529   2/559
## 3 1999 10M Lidiya Grigoryeva (W)  NR 0:53:40 5:22  3/2358       NR      NR
## 4 1999 10M     Eunice Sagero (W)  20 0:53:55 5:24  4/2358    W2024   1/196
## 5 1999 10M   Alla Zhilyayeva (W)  29 0:54:08 5:25  5/2358    W2529   3/559
## 6 1999 10M    Teresa Wanjiku (W)  24 0:54:10 5:25  6/2358    W2024   2/196
##   Hometown year divisionTitle section page
## 1    Kenya 1999 Overall+Women     10M    1
## 2    Kenya 1999 Overall+Women     10M    1
## 3   Russia 1999 Overall+Women     10M    1
## 4    Kenya 1999 Overall+Women     10M    1
## 5   Russia 1999 Overall+Women     10M    1
## 6    Kenya 1999 Overall+Women     10M    1
##                                                                                                                  source
## 1 http://www.cballtimeresults.org/performances?division=Overall+Women&page=1&section=10M&sex=W&utf8=%E2%9C%93&year=1999
## 2 http://www.cballtimeresults.org/performances?division=Overall+Women&page=1&section=10M&sex=W&utf8=%E2%9C%93&year=1999
## 3 http://www.cballtimeresults.org/performances?division=Overall+Women&page=1&section=10M&sex=W&utf8=%E2%9C%93&year=1999
## 4 http://www.cballtimeresults.org/performances?division=Overall+Women&page=1&section=10M&sex=W&utf8=%E2%9C%93&year=1999
## 5 http://www.cballtimeresults.org/performances?division=Overall+Women&page=1&section=10M&sex=W&utf8=%E2%9C%93&year=1999
## 6 http://www.cballtimeresults.org/performances?division=Overall+Women&page=1&section=10M&sex=W&utf8=%E2%9C%93&year=1999
##   sex
## 1   W
## 2   W
## 3   W
## 4   W
## 5   W
## 6   W
# Show the structure of the table: its dimensions and the type of each column.
str(CBWtableRaw)
## 'data.frame':    75866 obs. of  15 variables:
##  $ Race         : chr  "1999 10M" "1999 10M" "1999 10M" "1999 10M" ...
##  $ Name         : chr  "Jane Omoro (W)" "Jane Ngotho (W)" "Lidiya Grigoryeva (W)" "Eunice Sagero (W)" ...
##  $ Age          : chr  "26" "29" "NR" "20" ...
##  $ Time         : chr  "0:53:37" "0:53:38" "0:53:40" "0:53:55" ...
##  $ Pace         : chr  "5:22" "5:22" "5:22" "5:24" ...
##  $ PiS/TiS      : chr  "1/2358" "2/2358" "3/2358" "4/2358" ...
##  $ Division     : chr  "W2529" "W2529" "NR" "W2024" ...
##  $ PiD/TiD      : chr  "1/559" "2/559" "NR" "1/196" ...
##  $ Hometown     : chr  "Kenya" "Kenya" "Russia" "Kenya" ...
##  $ year         : int  1999 1999 1999 1999 1999 1999 1999 1999 1999 1999 ...
##  $ divisionTitle: chr  "Overall+Women" "Overall+Women" "Overall+Women" "Overall+Women" ...
##  $ section      : chr  "10M" "10M" "10M" "10M" ...
##  $ page         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ source       : chr  "http://www.cballtimeresults.org/performances?division=Overall+Women&page=1&section=10M&sex=W&utf8=%E2%9C%93&year=1999" "http://www.cballtimeresults.org/performances?division=Overall+Women&page=1&section=10M&sex=W&utf8=%E2%9C%93&year=1999" "http://www.cballtimeresults.org/performances?division=Overall+Women&page=1&section=10M&sex=W&utf8=%E2%9C%93&year=1999" "http://www.cballtimeresults.org/performances?division=Overall+Women&page=1&section=10M&sex=W&utf8=%E2%9C%93&year=1999" ...
##  $ sex          : chr  "W" "W" "W" "W" ...

Note: in the output above, missing values appear as the string "NR" rather than NA, NaN, or a blank. "NR" will need to be accounted for when filtering or converting these columns.

# Pull out the rows whose Division is the literal string "NR" to see how
# many there are and what they look like.
noresults <- dplyr::filter(CBWtableRaw, Division == "NR")
dim(noresults)
## [1] 19 15
head(noresults)
##       Race                    Name Age    Time  Pace   PiS/TiS Division PiD/TiD
## 1 1999 10M   Lidiya Grigoryeva (W)  NR 0:53:40  5:22    3/2358       NR      NR
## 2 1999 10M        Gladys Asiba (W)  NR 0:54:50  5:29    8/2358       NR      NR
## 3 1999 10M   Connie Buckwalter (W)  NR 0:59:36  5:58   17/2358       NR      NR
## 4 1999 10M            Ann Reid (W)  NR 1:53:03 11:18 2176/2358       NR      NR
## 5 2001 10M        Loretta Cuce (W)  NR 1:53:38 11:22 2611/2972       NR      NR
## 6 2002 10M Unidentified Runner (W)  NR 1:19:45  7:59  270/3333       NR      NR
##         Hometown year divisionTitle section page
## 1         Russia 1999 Overall+Women     10M    1
## 2          Kenya 1999 Overall+Women     10M    1
## 3  Lancaster, PA 1999 Overall+Women     10M    1
## 4   Bethesda, MD 1999 Overall+Women     10M  109
## 5 Alexandria, VA 2001 Overall+Women     10M  131
## 6 Washington, DC 2002 Overall+Women     10M   14
##                                                                                                                    source
## 1   http://www.cballtimeresults.org/performances?division=Overall+Women&page=1&section=10M&sex=W&utf8=%E2%9C%93&year=1999
## 2   http://www.cballtimeresults.org/performances?division=Overall+Women&page=1&section=10M&sex=W&utf8=%E2%9C%93&year=1999
## 3   http://www.cballtimeresults.org/performances?division=Overall+Women&page=1&section=10M&sex=W&utf8=%E2%9C%93&year=1999
## 4 http://www.cballtimeresults.org/performances?division=Overall+Women&page=109&section=10M&sex=W&utf8=%E2%9C%93&year=1999
## 5 http://www.cballtimeresults.org/performances?division=Overall+Women&page=131&section=10M&sex=W&utf8=%E2%9C%93&year=2001
## 6  http://www.cballtimeresults.org/performances?division=Overall+Women&page=14&section=10M&sex=W&utf8=%E2%9C%93&year=2002
##   sex
## 1   W
## 2   W
## 3   W
## 4   W
## 5   W
## 6   W
# Keep only the rows whose Division is not the "NR" placeholder.
# NOTE(review): the plots further down still read CBWtableRaw rather than
# `data`, so the "NR" rows remain in them -- confirm whether that is intended.
data <- dplyr::filter(CBWtableRaw, Division != "NR")

Convert race year to a factor (not yet done: the plots below still use the numeric year).

# Participants by year ----
# One row per race year holding the number of result rows for that year.
plotdata <- summarise(group_by(CBWtableRaw, year), count = n())
## `summarise()` ungrouping output (override with `.groups` argument)
# Both charts share the same data and x/y mapping.
year_axes <- ggplot(plotdata, aes(x = year, y = count))
# Trend view: a line with a point marker at each year.
p <- year_axes + geom_line() + geom_point()
ggplotly(p)
# Same counts drawn as solid blue bars.
p <- year_axes + geom_bar(stat = "identity", fill = "blue")
ggplotly(p)
# Yearly counts split by division ----
# One row per (year, Division) pair; the summary stays grouped by year.
plotdata <- summarise(group_by(CBWtableRaw, year, Division), count = n())
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
# Stacked bars: one bar per year, with a coloured segment for each division.
p <- ggplot(plotdata, aes(x = year, y = count, fill = Division)) +
  geom_bar(stat = "identity", position = "stack")
ggplotly(p)
# Share of each division within a year ----
# Denominator: number of result rows per year.
plotdata_by_year <- CBWtableRaw %>%
  group_by(year) %>%
  summarise(count_year = n())
## `summarise()` ungrouping output (override with `.groups` argument)
# One fill colour per distinct Division value, interpolated from the
# 9-colour "Paired" Brewer palette.
colourCount <- length(unique(CBWtableRaw$Division))
getPalette <- colorRampPalette(brewer.pal(9, "Paired"))

# Numerator: number of result rows per (year, Division) pair.
plotdata_by_year_div <- CBWtableRaw %>%
  group_by(year, Division) %>%
  summarise(count_year_div = n())
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
# Attach the yearly total to every division row and express the division
# count as a percentage of it, rounded to one decimal place.
# dplyr::full_join() is used instead of plyr::join(type = "full"): dplyr is
# already attached by this file, whereas plyr is never loaded.
plotdata <- plotdata_by_year %>%
  dplyr::full_join(plotdata_by_year_div, by = "year") %>%
  mutate(percent = round(count_year_div / count_year * 100, 1))

# Stacked bars of the percentages, with the legend below in two rows.
p <- plotdata %>%
  ggplot(aes(x = year, y = percent, fill = Division)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = getPalette(colourCount)) +
  theme(legend.position = "bottom") +
  guides(fill = guide_legend(nrow = 2))

ggplotly(p)
# Heatmap of the division distribution: are the runners getting any younger?
# Each tile is one (year, Division) cell, shaded by the `percent` column of
# plotdata on the viridis colour scale.
p <- ggplot(plotdata, aes(x = year, y = Division, fill = percent)) +
  geom_tile(alpha = .5) +
  scale_fill_viridis()
ggplotly(p)

Convert finish time to minutes.

# Check whether any row uses the "NR" placeholder in the Time column.
noresults <- dplyr::filter(CBWtableRaw, Time == "NR")
dim(noresults)
## [1]  0 15
head(noresults)
##  [1] Race          Name          Age           Time          Pace         
##  [6] PiS/TiS       Division      PiD/TiD       Hometown      year         
## [11] divisionTitle section       page          source        sex          
## <0 rows> (or 0-length row.names)
# Look at the last rows of the table as well.
tail(CBWtableRaw)
##           Race                Name Age    Time  Pace   PiS/TiS Division
## 75861 2012 10M    Effie Harary (W)  39 2:32:08 15:13 9724/9729    W3539
## 75862 2012 10M  Khristina Nava (W)  40 2:33:11 15:19 9725/9729    W4044
## 75863 2012 10M    Geneva Dixon (W)  31 2:36:03 15:36 9726/9729    W3034
## 75864 2012 10M Veronica Eligan (W)  55 2:36:45 15:41 9727/9729    W5559
## 75865 2012 10M    Denise Bobba (W)  40 2:36:54 15:41 9728/9729    W4044
## 75866 2012 10M Rashonna Waples (W)  38 2:50:58 17:06 9729/9729    W3539
##         PiD/TiD             Hometown year divisionTitle section page
## 75861 1365/1366      Long Branch, NJ 2012 Overall+Women     10M  487
## 75862   973/974       Fort Meade, MD 2012 Overall+Women     10M  487
## 75863 2228/2228    Manassas Park, VA 2012 Overall+Women     10M  487
## 75864   236/236    Mitchellville, MD 2012 Overall+Women     10M  487
## 75865   974/974          Herndon, VA 2012 Overall+Women     10M  487
## 75866 1366/1366 District Heights, MD 2012 Overall+Women     10M  487
##                                                                                                                        source
## 75861 http://www.cballtimeresults.org/performances?division=Overall+Women&page=487&section=10M&sex=W&utf8=%E2%9C%93&year=2012
## 75862 http://www.cballtimeresults.org/performances?division=Overall+Women&page=487&section=10M&sex=W&utf8=%E2%9C%93&year=2012
## 75863 http://www.cballtimeresults.org/performances?division=Overall+Women&page=487&section=10M&sex=W&utf8=%E2%9C%93&year=2012
## 75864 http://www.cballtimeresults.org/performances?division=Overall+Women&page=487&section=10M&sex=W&utf8=%E2%9C%93&year=2012
## 75865 http://www.cballtimeresults.org/performances?division=Overall+Women&page=487&section=10M&sex=W&utf8=%E2%9C%93&year=2012
## 75866 http://www.cballtimeresults.org/performances?division=Overall+Women&page=487&section=10M&sex=W&utf8=%E2%9C%93&year=2012
##       sex
## 75861   W
## 75862   W
## 75863   W
## 75864   W
## 75865   W
## 75866   W
# Convert Time from "H:MM:SS" text to decimal minutes, in place.
# The conversion is guarded on the column type so this chunk can be re-run
# safely: chron(times = ) treats numeric input as fractions of a day, so
# feeding it the already-converted minutes would silently corrupt the column.
if (is.character(CBWtableRaw$Time)) {
  race_clock <- chron(times = CBWtableRaw$Time) # parse the strings as HH:MM:SS
  res <- hms(race_clock)                        # split into hours, minutes, seconds
  # hours -> minutes, plus minutes, plus seconds -> minutes
  CBWtableRaw$Time <- hour(res) * 60 + minute(res) + second(res) / 60
} else if (!is.numeric(CBWtableRaw$Time)) {
  stop(
    "CBWtableRaw$Time must be \"H:MM:SS\" text or already-converted minutes",
    call. = FALSE
  )
}
# Sanity check: count the times that are missing after the conversion.
sum(is.na(CBWtableRaw$Time))
## [1] 0
# Yearly counts per division, one facet per division ----
# One row per (year, Division) pair with its number of result rows.
plotdata <- CBWtableRaw %>%
  group_by(year, Division) %>%
  summarise(count = n())
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
# One fill colour per distinct Division value, interpolated from the
# 9-colour "Paired" Brewer palette.
colourCount <- length(unique(CBWtableRaw$Division))
getPalette <- colorRampPalette(brewer.pal(9, "Paired"))

# Everything the two charts below have in common; only the facet scales differ.
division_bars <- plotdata %>%
  ggplot(aes(x = year, y = count, group = year, fill = Division)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = getPalette(colourCount)) +
  theme(legend.position = "bottom") +
  guides(fill = guide_legend(nrow = 2))

# Shared axes across facets, so division sizes can be compared directly.
# ggplotly() is called with its default tooltip: no `text` aesthetic is mapped
# in this plot, so tooltip = "text" would leave the hover labels empty.
p <- division_bars + facet_wrap(~Division)
ggplotly(p)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Free axes per facet, so the trend inside each division is visible.
p <- division_bars + facet_wrap(~Division, scales = "free")
ggplotly(p)

Boxplots of finish time by year, to compare the medians.

# One box per race year showing the distribution of Time; the boxes are
# grouped and coloured by year.
p <- ggplot(
  CBWtableRaw,
  aes(x = year, y = Time, group = year, color = year)
) +
  geom_boxplot()

ggplotly(p, tooltip = "text")

Looking at median finish time by division.

# Same per-year boxplot of Time, split into one panel per Division.
p <- ggplot(
  CBWtableRaw,
  aes(x = year, y = Time, group = year, color = year)
) +
  geom_boxplot() +
  facet_wrap(~Division)
ggplotly(p, tooltip = "text")
#predict LM


# LOESS fit of Time on year ----
# Scatter of Time against year, with the y axis restricted to 94-100.
plot(
  Time ~ year,
  data = CBWtableRaw,
  ylim = c(94, 100),
  main = "Year V Time"
)
# Local-regression fit of Time as a function of year.
out <- loess(Time ~ year, data = CBWtableRaw)
# Evaluate the fitted curve at arbitrary year values and overlay it on the
# existing plot.
loess_at <- function(x) predict(out, newdata = data.frame(year = x))
curve(loess_at, add = TRUE)

Next step: run a piecewise fit.

Evaluate the impact of runners' city/state on the planning recommendations.

#split out city and state to see in town/out of town distribution
library(dplyr)
library(tidyr)

#before <- CBWtableRaw

#unlist(strsplit(before$Hometown, "[, ]"))##Not working
#head(before, 10)